from urllib.parse import urljoin

import lxml.etree as le
import pandas as pd
import requests
from selenium import webdriver

url = 'https://www.runoob.com/html/html-tutorial.html'  # 要爬取地址
url_head = 'https://www.runoob.com'  # 地址开头部分，后面保存链接时用

# x = "//a[@target = '_top']/@href"
x_element = "//a[@target = '_top']/{sub}"  # 提取小节对应xpath，sub内容供后面格式化使用

#获取网页内容
content = requests.get(url =url).content
contentx = le.HTML(content)
# print(contentx)

#利用xpath提取小节名称
rets = contentx.xpath(x_element.format(sub='text()'))
name=[i.strip().replace(' ','') for i in rets]

#利用xpath提取小节名称对应链接地址
rets_urls = contentx.xpath(x_element.format(sub='@href'))
urls = [ url_head+i for i in rets_urls]

#转变为dataframe类型并保存到excel文件
df_result = pd.DataFrame()
df_result['name'] = name
df_result['urls'] = urls
print(df_result)

df_result.to_excel('./阶段5模块6-作业结果.xls')
